Introduction

The dataset

The Breast Cancer (Wisconsin) Diagnosis dataset contains the diagnosis and a set of 30 features describing the characteristics of the cell nuclei present in the digitized image of a fine needle aspirate (FNA) of a breast mass.

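Ten real-valued features are computed for each cell nucleus: radius, texture, perimeter, area, smoothness, compactness, concavity, concave points, symmetry and fractal dimension.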

The mean, standard error (SE) and “worst” or largest (mean of the three largest values) of these features were computed for each image, resulting in 30 features. We will analyze the features to understand the predictive value for diagnosis. We will then create models using two different algorithms and use the models to predict the diagnosis.

suppressMessages(library(ggplot2))
suppressMessages(library(GGally))
suppressMessages(library(dplyr))
suppressMessages(library(DataExplorer))
suppressMessages(library(funModeling))

1) IMPORTING DATASET

# Load the raw CSV export and preview its first ten records.
wbcd <- read.csv(file = "data.csv")
head(wbcd, n = 10)
##          id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1    842302         M       17.99        10.38         122.80    1001.0
## 2    842517         M       20.57        17.77         132.90    1326.0
## 3  84300903         M       19.69        21.25         130.00    1203.0
## 4  84348301         M       11.42        20.38          77.58     386.1
## 5  84358402         M       20.29        14.34         135.10    1297.0
## 6    843786         M       12.45        15.70          82.57     477.1
## 7    844359         M       18.25        19.98         119.60    1040.0
## 8  84458202         M       13.71        20.83          90.20     577.9
## 9    844981         M       13.00        21.82          87.50     519.8
## 10 84501001         M       12.46        24.04          83.97     475.9
##    smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1          0.11840          0.27760        0.30010             0.14710
## 2          0.08474          0.07864        0.08690             0.07017
## 3          0.10960          0.15990        0.19740             0.12790
## 4          0.14250          0.28390        0.24140             0.10520
## 5          0.10030          0.13280        0.19800             0.10430
## 6          0.12780          0.17000        0.15780             0.08089
## 7          0.09463          0.10900        0.11270             0.07400
## 8          0.11890          0.16450        0.09366             0.05985
## 9          0.12730          0.19320        0.18590             0.09353
## 10         0.11860          0.23960        0.22730             0.08543
##    symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1         0.2419                0.07871    1.0950     0.9053        8.589
## 2         0.1812                0.05667    0.5435     0.7339        3.398
## 3         0.2069                0.05999    0.7456     0.7869        4.585
## 4         0.2597                0.09744    0.4956     1.1560        3.445
## 5         0.1809                0.05883    0.7572     0.7813        5.438
## 6         0.2087                0.07613    0.3345     0.8902        2.217
## 7         0.1794                0.05742    0.4467     0.7732        3.180
## 8         0.2196                0.07451    0.5835     1.3770        3.856
## 9         0.2350                0.07389    0.3063     1.0020        2.406
## 10        0.2030                0.08243    0.2976     1.5990        2.039
##    area_se smoothness_se compactness_se concavity_se concave.points_se
## 1   153.40      0.006399        0.04904      0.05373           0.01587
## 2    74.08      0.005225        0.01308      0.01860           0.01340
## 3    94.03      0.006150        0.04006      0.03832           0.02058
## 4    27.23      0.009110        0.07458      0.05661           0.01867
## 5    94.44      0.011490        0.02461      0.05688           0.01885
## 6    27.19      0.007510        0.03345      0.03672           0.01137
## 7    53.91      0.004314        0.01382      0.02254           0.01039
## 8    50.96      0.008805        0.03029      0.02488           0.01448
## 9    24.32      0.005731        0.03502      0.03553           0.01226
## 10   23.94      0.007149        0.07217      0.07743           0.01432
##    symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1      0.03003             0.006193        25.38         17.33          184.60
## 2      0.01389             0.003532        24.99         23.41          158.80
## 3      0.02250             0.004571        23.57         25.53          152.50
## 4      0.05963             0.009208        14.91         26.50           98.87
## 5      0.01756             0.005115        22.54         16.67          152.20
## 6      0.02165             0.005082        15.47         23.75          103.40
## 7      0.01369             0.002179        22.88         27.66          153.20
## 8      0.01486             0.005412        17.06         28.14          110.60
## 9      0.02143             0.003749        15.49         30.73          106.20
## 10     0.01789             0.010080        15.09         40.68           97.65
##    area_worst smoothness_worst compactness_worst concavity_worst
## 1      2019.0           0.1622            0.6656          0.7119
## 2      1956.0           0.1238            0.1866          0.2416
## 3      1709.0           0.1444            0.4245          0.4504
## 4       567.7           0.2098            0.8663          0.6869
## 5      1575.0           0.1374            0.2050          0.4000
## 6       741.6           0.1791            0.5249          0.5355
## 7      1606.0           0.1442            0.2576          0.3784
## 8       897.0           0.1654            0.3682          0.2678
## 9       739.3           0.1703            0.5401          0.5390
## 10      711.4           0.1853            1.0580          1.1050
##    concave.points_worst symmetry_worst fractal_dimension_worst  X
## 1                0.2654         0.4601                 0.11890 NA
## 2                0.1860         0.2750                 0.08902 NA
## 3                0.2430         0.3613                 0.08758 NA
## 4                0.2575         0.6638                 0.17300 NA
## 5                0.1625         0.2364                 0.07678 NA
## 6                0.1741         0.3985                 0.12440 NA
## 7                0.1932         0.3063                 0.08368 NA
## 8                0.1556         0.3196                 0.11510 NA
## 9                0.2060         0.4378                 0.10720 NA
## 10               0.2210         0.4366                 0.20750 NA
# Number of rows (observations) and columns (variables) in the raw data.
dim(wbcd)
## [1] 569  33
# Column-by-column overview: name, storage type and the first few values.
str(wbcd)
## 'data.frame':    569 obs. of  33 variables:
##  $ id                     : int  842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
##  $ diagnosis              : chr  "M" "M" "M" "M" ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...
##  $ X                      : logi  NA NA NA NA NA NA ...

2) DATA CLEANING & PRE-PROCESSING

Distribution of different attributes

# Plot the distribution of every numeric column; the id column is removed first.
wbcd %>%
  select(-id) %>%
  plot_num(bins = 10)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

# Remove the two columns that are not features: "X" (shown by str() as
# logical NA) and the patient identifier "id". Both are dropped by name so the
# step does not depend on the column order of the CSV.
wbcd$X <- NULL
wbcd$id <- NULL

# Recode the diagnosis codes as a labelled factor. With an explicit level
# mapping, an unexpected code becomes NA instead of being silently labelled
# "Malignant" (which is what ifelse(diagnosis == "B", ...) would do).
wbcd$diagnosis <- factor(wbcd$diagnosis,
                         levels = c("B", "M"),
                         labels = c("Benign", "Malignant"))
head(wbcd, 10)
##    diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 1  Malignant       17.99        10.38         122.80    1001.0         0.11840
## 2  Malignant       20.57        17.77         132.90    1326.0         0.08474
## 3  Malignant       19.69        21.25         130.00    1203.0         0.10960
## 4  Malignant       11.42        20.38          77.58     386.1         0.14250
## 5  Malignant       20.29        14.34         135.10    1297.0         0.10030
## 6  Malignant       12.45        15.70          82.57     477.1         0.12780
## 7  Malignant       18.25        19.98         119.60    1040.0         0.09463
## 8  Malignant       13.71        20.83          90.20     577.9         0.11890
## 9  Malignant       13.00        21.82          87.50     519.8         0.12730
## 10 Malignant       12.46        24.04          83.97     475.9         0.11860
##    compactness_mean concavity_mean concave.points_mean symmetry_mean
## 1           0.27760        0.30010             0.14710        0.2419
## 2           0.07864        0.08690             0.07017        0.1812
## 3           0.15990        0.19740             0.12790        0.2069
## 4           0.28390        0.24140             0.10520        0.2597
## 5           0.13280        0.19800             0.10430        0.1809
## 6           0.17000        0.15780             0.08089        0.2087
## 7           0.10900        0.11270             0.07400        0.1794
## 8           0.16450        0.09366             0.05985        0.2196
## 9           0.19320        0.18590             0.09353        0.2350
## 10          0.23960        0.22730             0.08543        0.2030
##    fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 1                 0.07871    1.0950     0.9053        8.589  153.40
## 2                 0.05667    0.5435     0.7339        3.398   74.08
## 3                 0.05999    0.7456     0.7869        4.585   94.03
## 4                 0.09744    0.4956     1.1560        3.445   27.23
## 5                 0.05883    0.7572     0.7813        5.438   94.44
## 6                 0.07613    0.3345     0.8902        2.217   27.19
## 7                 0.05742    0.4467     0.7732        3.180   53.91
## 8                 0.07451    0.5835     1.3770        3.856   50.96
## 9                 0.07389    0.3063     1.0020        2.406   24.32
## 10                0.08243    0.2976     1.5990        2.039   23.94
##    smoothness_se compactness_se concavity_se concave.points_se symmetry_se
## 1       0.006399        0.04904      0.05373           0.01587     0.03003
## 2       0.005225        0.01308      0.01860           0.01340     0.01389
## 3       0.006150        0.04006      0.03832           0.02058     0.02250
## 4       0.009110        0.07458      0.05661           0.01867     0.05963
## 5       0.011490        0.02461      0.05688           0.01885     0.01756
## 6       0.007510        0.03345      0.03672           0.01137     0.02165
## 7       0.004314        0.01382      0.02254           0.01039     0.01369
## 8       0.008805        0.03029      0.02488           0.01448     0.01486
## 9       0.005731        0.03502      0.03553           0.01226     0.02143
## 10      0.007149        0.07217      0.07743           0.01432     0.01789
##    fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## 1              0.006193        25.38         17.33          184.60     2019.0
## 2              0.003532        24.99         23.41          158.80     1956.0
## 3              0.004571        23.57         25.53          152.50     1709.0
## 4              0.009208        14.91         26.50           98.87      567.7
## 5              0.005115        22.54         16.67          152.20     1575.0
## 6              0.005082        15.47         23.75          103.40      741.6
## 7              0.002179        22.88         27.66          153.20     1606.0
## 8              0.005412        17.06         28.14          110.60      897.0
## 9              0.003749        15.49         30.73          106.20      739.3
## 10             0.010080        15.09         40.68           97.65      711.4
##    smoothness_worst compactness_worst concavity_worst concave.points_worst
## 1            0.1622            0.6656          0.7119               0.2654
## 2            0.1238            0.1866          0.2416               0.1860
## 3            0.1444            0.4245          0.4504               0.2430
## 4            0.2098            0.8663          0.6869               0.2575
## 5            0.1374            0.2050          0.4000               0.1625
## 6            0.1791            0.5249          0.5355               0.1741
## 7            0.1442            0.2576          0.3784               0.1932
## 8            0.1654            0.3682          0.2678               0.1556
## 9            0.1703            0.5401          0.5390               0.2060
## 10           0.1853            1.0580          1.1050               0.2210
##    symmetry_worst fractal_dimension_worst
## 1          0.4601                 0.11890
## 2          0.2750                 0.08902
## 3          0.3613                 0.08758
## 4          0.6638                 0.17300
## 5          0.2364                 0.07678
## 6          0.3985                 0.12440
## 7          0.3063                 0.08368
## 8          0.3196                 0.11510
## 9          0.4378                 0.10720
## 10         0.4366                 0.20750
# Per-column summary: class counts for diagnosis, quartiles and mean for each feature.
summary(wbcd)
##      diagnosis    radius_mean      texture_mean   perimeter_mean  
##  Benign   :357   Min.   : 6.981   Min.   : 9.71   Min.   : 43.79  
##  Malignant:212   1st Qu.:11.700   1st Qu.:16.17   1st Qu.: 75.17  
##                  Median :13.370   Median :18.84   Median : 86.24  
##                  Mean   :14.127   Mean   :19.29   Mean   : 91.97  
##                  3rd Qu.:15.780   3rd Qu.:21.80   3rd Qu.:104.10  
##                  Max.   :28.110   Max.   :39.28   Max.   :188.50  
##    area_mean      smoothness_mean   compactness_mean  concavity_mean   
##  Min.   : 143.5   Min.   :0.05263   Min.   :0.01938   Min.   :0.00000  
##  1st Qu.: 420.3   1st Qu.:0.08637   1st Qu.:0.06492   1st Qu.:0.02956  
##  Median : 551.1   Median :0.09587   Median :0.09263   Median :0.06154  
##  Mean   : 654.9   Mean   :0.09636   Mean   :0.10434   Mean   :0.08880  
##  3rd Qu.: 782.7   3rd Qu.:0.10530   3rd Qu.:0.13040   3rd Qu.:0.13070  
##  Max.   :2501.0   Max.   :0.16340   Max.   :0.34540   Max.   :0.42680  
##  concave.points_mean symmetry_mean    fractal_dimension_mean   radius_se     
##  Min.   :0.00000     Min.   :0.1060   Min.   :0.04996        Min.   :0.1115  
##  1st Qu.:0.02031     1st Qu.:0.1619   1st Qu.:0.05770        1st Qu.:0.2324  
##  Median :0.03350     Median :0.1792   Median :0.06154        Median :0.3242  
##  Mean   :0.04892     Mean   :0.1812   Mean   :0.06280        Mean   :0.4052  
##  3rd Qu.:0.07400     3rd Qu.:0.1957   3rd Qu.:0.06612        3rd Qu.:0.4789  
##  Max.   :0.20120     Max.   :0.3040   Max.   :0.09744        Max.   :2.8730  
##    texture_se      perimeter_se       area_se        smoothness_se     
##  Min.   :0.3602   Min.   : 0.757   Min.   :  6.802   Min.   :0.001713  
##  1st Qu.:0.8339   1st Qu.: 1.606   1st Qu.: 17.850   1st Qu.:0.005169  
##  Median :1.1080   Median : 2.287   Median : 24.530   Median :0.006380  
##  Mean   :1.2169   Mean   : 2.866   Mean   : 40.337   Mean   :0.007041  
##  3rd Qu.:1.4740   3rd Qu.: 3.357   3rd Qu.: 45.190   3rd Qu.:0.008146  
##  Max.   :4.8850   Max.   :21.980   Max.   :542.200   Max.   :0.031130  
##  compactness_se      concavity_se     concave.points_se   symmetry_se      
##  Min.   :0.002252   Min.   :0.00000   Min.   :0.000000   Min.   :0.007882  
##  1st Qu.:0.013080   1st Qu.:0.01509   1st Qu.:0.007638   1st Qu.:0.015160  
##  Median :0.020450   Median :0.02589   Median :0.010930   Median :0.018730  
##  Mean   :0.025478   Mean   :0.03189   Mean   :0.011796   Mean   :0.020542  
##  3rd Qu.:0.032450   3rd Qu.:0.04205   3rd Qu.:0.014710   3rd Qu.:0.023480  
##  Max.   :0.135400   Max.   :0.39600   Max.   :0.052790   Max.   :0.078950  
##  fractal_dimension_se  radius_worst   texture_worst   perimeter_worst 
##  Min.   :0.0008948    Min.   : 7.93   Min.   :12.02   Min.   : 50.41  
##  1st Qu.:0.0022480    1st Qu.:13.01   1st Qu.:21.08   1st Qu.: 84.11  
##  Median :0.0031870    Median :14.97   Median :25.41   Median : 97.66  
##  Mean   :0.0037949    Mean   :16.27   Mean   :25.68   Mean   :107.26  
##  3rd Qu.:0.0045580    3rd Qu.:18.79   3rd Qu.:29.72   3rd Qu.:125.40  
##  Max.   :0.0298400    Max.   :36.04   Max.   :49.54   Max.   :251.20  
##    area_worst     smoothness_worst  compactness_worst concavity_worst 
##  Min.   : 185.2   Min.   :0.07117   Min.   :0.02729   Min.   :0.0000  
##  1st Qu.: 515.3   1st Qu.:0.11660   1st Qu.:0.14720   1st Qu.:0.1145  
##  Median : 686.5   Median :0.13130   Median :0.21190   Median :0.2267  
##  Mean   : 880.6   Mean   :0.13237   Mean   :0.25427   Mean   :0.2722  
##  3rd Qu.:1084.0   3rd Qu.:0.14600   3rd Qu.:0.33910   3rd Qu.:0.3829  
##  Max.   :4254.0   Max.   :0.22260   Max.   :1.05800   Max.   :1.2520  
##  concave.points_worst symmetry_worst   fractal_dimension_worst
##  Min.   :0.00000      Min.   :0.1565   Min.   :0.05504        
##  1st Qu.:0.06493      1st Qu.:0.2504   1st Qu.:0.07146        
##  Median :0.09993      Median :0.2822   Median :0.08004        
##  Mean   :0.11461      Mean   :0.2901   Mean   :0.08395        
##  3rd Qu.:0.16140      3rd Qu.:0.3179   3rd Qu.:0.09208        
##  Max.   :0.29100      Max.   :0.6638   Max.   :0.20750
# Chart the missing values found in each column of the cleaned data.
plot_missing(wbcd)

3) VISUALIZATIONS

Malignant and Benign diagnosis barplot

# Insight into Breast Cancer Wisconsin data: number of observations per
# diagnosis, with the count printed above each bar.
ggplot(data = wbcd, aes(x = diagnosis, fill = diagnosis)) +
  geom_bar() +
  # after_stat(count) replaces the deprecated ..count.. notation.
  geom_text(stat = 'count', aes(label = after_stat(count)), vjust = -1) +
  labs(title = 'Diagnosis of Breast Cancer',
       subtitle = 'Most of the diagnosis (63%) are Benign',
       caption = 'Data owned by the University of Wisconsin',
       x = 'Diagnosis', y = 'Number of observations')

Scatterplot

Mean Perimeter and Mean Radius

Over here, we will be using the mean perimeter and the mean radius observed from the center of the lump to the perimeter. This will reveal how both types of lumps look in relative size.

# Mean radius against mean perimeter, coloured by diagnosis.
# The dashed gray guides sit at perimeter = 116 and radius = 18.
ggplot(
  wbcd,
  mapping = aes(x = radius_mean, y = perimeter_mean, color = diagnosis)
) +
  geom_point() +
  geom_hline(yintercept = 116, color = "gray", linetype = "dashed") +
  geom_vline(xintercept = 18, color = "gray", linetype = "dashed") +
  labs(
    title = "Mean Perimeter and Mean Radius",
    subtitle = "Malignant lumps can get relatively bigger than benigns",
    caption = "Data owned by the University of Wisconsin",
    x = "Mean Radius",
    y = "Mean Perimeter"
  ) +
  annotate(
    "text",
    x = 24, y = 150, angle = 45, size = 2.3,
    label = "45% of malignants are bigger than every observed benign"
  )

Insights: Malignant lumps can get relatively bigger than benign lumps. This has the possibility of sparking up a hypothesis that malignant lumps begin as benigns.

Mean Texture and Smoothness of Lumps

# Mean texture against mean smoothness, coloured by diagnosis.
# The dashed guide marks the median mean texture. It is computed from the data
# (instead of the hard-coded 18.84) so the line and its label always agree.
texture_median <- median(wbcd$texture_mean)
ggplot(data = wbcd,
       aes(x = texture_mean, y = smoothness_mean, color = diagnosis)) +
  geom_point() +
  geom_vline(xintercept = texture_median, linetype = 'dashed', color = 'gray') +
  # Title typo fixed: "Smoothess" -> "Smoothness".
  labs(title = 'Mean Texture and Smoothness of Lumps',
       subtitle = 'Most benigns (66%) are below the median mean texture',
       caption = 'Data owned by the University of Wisconsin',
       x = 'Mean Texture', y = 'Mean Smoothness') +
  annotate('text', label = paste0('median = ', texture_median), x = 22, y = 0.160,
           size = 2.5)

Insights from Texture and Smoothness Visualization

Not a lot of variation can be seen in the mean smoothness of the two diagnoses, as they all seem to be clustered from the bottom to the upper midsection of the plot. However, we can observe that most of the malignants (66%) are skewed to the right side of the median. This suggests that malignant lumps display higher texture variation values than benigns.

Compactness and Concavity

# Mean compactness against mean concavity, coloured by diagnosis, with a
# smoothed trend line added by geom_smooth().
ggplot(
  wbcd,
  mapping = aes(x = compactness_mean, y = concavity_mean, color = diagnosis)
) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Mean Compactness and Mean Concavity",
    subtitle = "Most benigns display less concavity and compactness",
    caption = "Data owned by the University of Wisconsin",
    x = "Mean Compactness",
    y = "Mean Concavity"
  )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Insight from Compactness and Concavity

There is a clear display of outliers within the data. However, a visual analysis reveals that benign lumps tend to have low mean concavity and low mean compactness. This is manifested in the benigns being skewed towards the bottom left side of the graph. Notice that the malignants display a wider range, from low concavity and low compactness to high concavity and high compactness. This visualization suggests that benigns usually have low to medium severe concaves at the contours of the lumps, whereas malignant lumps can display anywhere between low and very high concavity and compactness.

4) ANALYSING THE CORRELATION B/W VARIABLES

Correlation between each pair of variables

(a)MEAN

# Pairwise plot matrix of the ten *_mean features (columns 2 to 11).
# The title previously read "Cancer Worst", copied from the worst-feature plot.
ggpairs(wbcd[, 2:11]) +
  theme_bw() +
  labs(title = "Cancer Mean") +
  theme(plot.title = element_text(face = 'bold', color = 'black', hjust = 0.5, size = 13))

(b)STANDARD ERROR

# Pairwise plot matrix of the ten *_se features (columns 12 to 21).
# The title previously read "Cancer Worst", copied from the worst-feature plot.
ggpairs(wbcd[, 12:21]) +
  theme_bw() +
  labs(title = "Cancer SE") +
  theme(plot.title = element_text(face = 'bold', color = 'black', hjust = 0.5, size = 13))

(c)WORST

# Pairwise plot matrix of the ten *_worst features (columns 22 to 31).
ggpairs(wbcd[, 22:31]) +
  theme_bw() +
  labs(title = "Cancer Worst") +
  theme(
    plot.title = element_text(face = "bold", color = "black", hjust = 0.5, size = 13)
  )

Viewing correlation between different variables using the ggcorr function

(a) MEAN

# Labelled correlation matrix of the ten *_mean features; the legend is hidden.
ggcorr(wbcd[, 2:11], label = TRUE, name = "corr") +
  labs(title = "Cancer Mean") +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", color = "black", hjust = 0.5, size = 12)
  )

(b)STANDARD ERROR

# Labelled correlation matrix of the ten *_se features; the legend is hidden.
ggcorr(wbcd[, 12:21], label = TRUE, name = "corr") +
  labs(title = "Cancer SE") +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", color = "black", hjust = 0.5, size = 12)
  )

(c)WORST

# Labelled correlation matrix of the ten *_worst features; the legend is hidden.
ggcorr(wbcd[, 22:31], label = TRUE, name = "corr") +
  labs(title = "Cancer Worst") +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", color = "black", hjust = 0.5, size = 12)
  )

5)PRINCIPAL COMPONENT ANALYSIS (PCA)

library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Working copy of the cleaned data for the PCA steps (no columns are changed).
wbcd_pca <- wbcd

All

The cumulative proportion from PC1 to PC6 is about 88.7% (above 85%). This means that the first six principal components together explain about 88.7% of the total variance in the data.

# PCA over all 30 features (column 1, the diagnosis, is excluded).
# scale. = TRUE standardises each feature to unit variance so that features on
# large scales do not dominate the components.
# The former `cor = TRUE` is a princomp() argument; prcomp() ignored it and
# emitted an "extra argument 'cor' will be disregarded" warning.
all_pca <- prcomp(wbcd_pca[, -1], scale. = TRUE)
summary(all_pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.6444 2.3857 1.67867 1.40735 1.28403 1.09880 0.82172
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025 0.02251
## Cumulative Proportion  0.4427 0.6324 0.72636 0.79239 0.84734 0.88759 0.91010
##                            PC8    PC9    PC10   PC11    PC12    PC13    PC14
## Standard deviation     0.69037 0.6457 0.59219 0.5421 0.51104 0.49128 0.39624
## Proportion of Variance 0.01589 0.0139 0.01169 0.0098 0.00871 0.00805 0.00523
## Cumulative Proportion  0.92598 0.9399 0.95157 0.9614 0.97007 0.97812 0.98335
##                           PC15    PC16    PC17    PC18    PC19    PC20   PC21
## Standard deviation     0.30681 0.28260 0.24372 0.22939 0.22244 0.17652 0.1731
## Proportion of Variance 0.00314 0.00266 0.00198 0.00175 0.00165 0.00104 0.0010
## Cumulative Proportion  0.98649 0.98915 0.99113 0.99288 0.99453 0.99557 0.9966
##                           PC22    PC23   PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.16565 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion  0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
##                           PC29    PC30
## Standard deviation     0.02736 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion  1.00000 1.00000

Mean

The cumulative proportion from PC1 to PC3 is about 88.7%. (above 85%)

# PCA over the ten *_mean features (columns 2 to 11), scaled to unit variance.
# The argument is spelled `scale.` in full rather than relying on partial
# matching of `scale`.
mean_pca <- prcomp(wbcd_pca[, 2:11], scale. = TRUE)
summary(mean_pca)
## Importance of components:
##                           PC1    PC2     PC3    PC4     PC5     PC6     PC7
## Standard deviation     2.3406 1.5870 0.93841 0.7064 0.61036 0.35234 0.28299
## Proportion of Variance 0.5479 0.2519 0.08806 0.0499 0.03725 0.01241 0.00801
## Cumulative Proportion  0.5479 0.7997 0.88779 0.9377 0.97495 0.98736 0.99537
##                            PC8     PC9    PC10
## Standard deviation     0.18679 0.10552 0.01680
## Proportion of Variance 0.00349 0.00111 0.00003
## Cumulative Proportion  0.99886 0.99997 1.00000

SE

The cumulative proportion from PC1 to PC4 is about 86.7%. (above 85%)

# PCA over the ten *_se features (columns 12 to 21), scaled to unit variance.
# The argument is spelled `scale.` in full rather than relying on partial
# matching of `scale`.
se_pca <- prcomp(wbcd_pca[, 12:21], scale. = TRUE)
summary(se_pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.1779 1.4406 1.1245 0.77095 0.75991 0.57939 0.43512
## Proportion of Variance 0.4743 0.2075 0.1264 0.05944 0.05775 0.03357 0.01893
## Cumulative Proportion  0.4743 0.6819 0.8083 0.86774 0.92548 0.95905 0.97798
##                           PC8     PC9    PC10
## Standard deviation     0.3962 0.20436 0.14635
## Proportion of Variance 0.0157 0.00418 0.00214
## Cumulative Proportion  0.9937 0.99786 1.00000

Worst

The cumulative proportion from PC1 to PC3 is about 85.8%. (above 85%)

# PCA over the ten *_worst features (columns 22 to 31), scaled to unit variance.
# The argument is spelled `scale.` in full rather than relying on partial
# matching of `scale`.
worst_pca <- prcomp(wbcd_pca[, 22:31], scale. = TRUE)
summary(worst_pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.3869 1.4443 0.89597 0.73531 0.71741 0.42862 0.28959
## Proportion of Variance 0.5697 0.2086 0.08028 0.05407 0.05147 0.01837 0.00839
## Cumulative Proportion  0.5697 0.7783 0.85860 0.91267 0.96413 0.98251 0.99089
##                            PC8     PC9    PC10
## Standard deviation     0.26802 0.12343 0.06326
## Proportion of Variance 0.00718 0.00152 0.00040
## Cumulative Proportion  0.99808 0.99960 1.00000

SCREE PLOTS

All

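The eigenvalue = 1 reference line falls just after PC6: PC6 is the last component with an eigenvalue above 1 (about 1.21, versus about 0.68 for PC7).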

# Scree plot of the component variances (eigenvalues) for the all-feature PCA,
# with a dashed reference line at eigenvalue = 1.
# The title now states 15 PCs to match npcs = 15 (it previously said 10).
screeplot(all_pca, type = "lines", npcs = 15, main = "Screeplot of the first 15 PCs")
abline(h = 1, col = "red", lty = 5)
legend("topright", legend = c("Eigenvalue = 1"),
       col = c("red"), lty = 5, cex = 0.6)

# Percentage of variance explained by each of the first 10 components.
fviz_eig(all_pca, addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
         barfill = "pink", barcolor = "grey", linecolor = "red", ncp = 10) +
  labs(title = "Cancer All Variances - PCA",
       x = "Principal Components", y = "% of variances")

Mean

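The eigenvalue = 1 reference line falls between PC2 and PC3 (eigenvalues of about 2.52 and 0.88), so only PC1 and PC2 lie above it.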

# Scree plot of the component variances (eigenvalues) for the mean-feature PCA,
# with a dashed reference line at eigenvalue = 1.
# mean_pca has only 10 components, so npcs is 10 (it was 15, which asked for
# components that do not exist and contradicted the title).
screeplot(mean_pca, type = "lines", npcs = 10, main = "Screeplot of the first 10 PCs")
abline(h = 1, col = "red", lty = 5)
legend("topright", legend = c("Eigenvalue = 1"),
       col = c("red"), lty = 5, cex = 0.6)

# Percentage of variance explained by each component.
fviz_eig(mean_pca, addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
         barfill = "pink", barcolor = "grey", linecolor = "red", ncp = 10) +
  labs(title = "Cancer Mean Variances - PCA",
       x = "Principal Components", y = "% of variances")

Standard Error

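The eigenvalue = 1 reference line falls between PC3 and PC4 (eigenvalues of about 1.26 and 0.59), so PC1 to PC3 lie above it.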

# Scree plot of the component variances (eigenvalues) for the SE-feature PCA,
# with a dashed reference line at eigenvalue = 1.
# se_pca has only 10 components, so npcs is 10 (it was 15, which asked for
# components that do not exist and contradicted the title).
screeplot(se_pca, type = "lines", npcs = 10, main = "Screeplot of the first 10 PCs")
abline(h = 1, col = "red", lty = 5)
legend("topright", legend = c("Eigenvalue = 1"),
       col = c("red"), lty = 5, cex = 0.6)

# Percentage of variance explained by each component.
fviz_eig(se_pca, addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
         barfill = "pink", barcolor = "grey", linecolor = "red", ncp = 10) +
  labs(title = "Cancer SE Variances - PCA",
       x = "Principal Components", y = "% of variances")

Worst

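The eigenvalue = 1 reference line falls between PC2 and PC3 (eigenvalues of about 2.09 and 0.80), so only PC1 and PC2 lie above it.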

# Scree plot of the component variances (eigenvalues) for the worst-feature
# PCA, with a dashed reference line at eigenvalue = 1.
# worst_pca has only 10 components, so npcs is 10 (it was 15, which asked for
# components that do not exist and contradicted the title).
screeplot(worst_pca, type = "lines", npcs = 10, main = "Screeplot of the first 10 PCs")
abline(h = 1, col = "red", lty = 5)
legend("topright", legend = c("Eigenvalue = 1"),
       col = c("red"), lty = 5, cex = 0.6)

# Percentage of variance explained by each component.
fviz_eig(worst_pca, addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
         barfill = "pink", barcolor = "grey", linecolor = "red", ncp = 10) +
  labs(title = "Cancer Worst Variances - PCA",
       x = "Principal Components", y = "% of variances")

GET PCA VARIABLES

# Variable-level results of the all-feature PCA (coordinates, correlations,
# cos2 and contributions); printing lists the available components.
all_var <- get_pca_var(all_pca)
print(all_var)
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"
Quality of representation of PCA

Correlation between variables and PCA

library("corrplot")
## corrplot 0.92 loaded
# cos2 of each variable on each dimension (not a correlation matrix, hence is.corr = FALSE).
corrplot(all_var[["cos2"]], is.corr = FALSE)

Contributions of variables to PCA

This highlights the variables that contribute most to each component.

# Contribution of each variable to each dimension.
corrplot(all_var[["contrib"]], is.corr = FALSE)

Contributions of variables to PC1 & PC2
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Ten largest variable contributions to PC1 (p1) and PC2 (p2), shown side by side.
p1 <- fviz_contrib(all_pca, choice = "var", axes = 1, top = 10,
                   fill = "pink", color = "grey")
p2 <- fviz_contrib(all_pca, choice = "var", axes = 2, top = 10,
                   fill = "skyblue", color = "grey")
grid.arrange(p1, p2, ncol = 2)

Mean

Get PCA Variables
# Variable-level results of the mean-feature PCA (coordinates, correlations,
# cos2 and contributions); printing lists the available components.
mean_var <- get_pca_var(mean_pca)
print(mean_var)
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"
Quality of representation of PCA

Correlation between variables and PCA

# cos2 of each mean feature on each dimension (not a correlation matrix, hence is.corr = FALSE).
library(corrplot)
corrplot(mean_var[["cos2"]], is.corr = FALSE)

Contributions of variables to PCA

This highlights the variables that contribute most to each component.

# Contribution of each mean feature to each dimension.
corrplot(mean_var[["contrib"]], is.corr = FALSE)

Contributions of variables to PC1 & PC2
# Ten largest variable contributions to PC1 (p1) and PC2 (p2), shown side by side.
library(gridExtra)
p1 <- fviz_contrib(mean_pca, choice = "var", axes = 1, top = 10,
                   fill = "pink", color = "grey")
p2 <- fviz_contrib(mean_pca, choice = "var", axes = 2, top = 10,
                   fill = "skyblue", color = "grey")
grid.arrange(p1, p2, ncol = 2)

SE

Get PCA Variables
# Variable-level results of the SE-feature PCA (coordinates, correlations,
# cos2 and contributions); printing lists the available components.
se_var <- get_pca_var(se_pca)
print(se_var)
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"
Quality of representation of PCA

Correlation between variables and PCA

# cos2 of each SE feature on each dimension (not a correlation matrix, hence is.corr = FALSE).
library(corrplot)
corrplot(se_var[["cos2"]], is.corr = FALSE)

Contributions of variables to PCA

This highlights the variables that contribute most to each component.

# Contribution of each SE feature to each dimension.
corrplot(se_var[["contrib"]], is.corr = FALSE)

Contributions of variables to PC1 & PC2
# Ten largest variable contributions to PC1 (p1) and PC2 (p2), shown side by side.
library(gridExtra)
p1 <- fviz_contrib(se_pca, choice = "var", axes = 1, top = 10,
                   fill = "pink", color = "grey")
p2 <- fviz_contrib(se_pca, choice = "var", axes = 2, top = 10,
                   fill = "skyblue", color = "grey")
grid.arrange(p1, p2, ncol = 2)

Worst

Get PCA Variables
# Variable-level results of the worst-feature PCA (coordinates, correlations,
# cos2 and contributions); printing lists the available components.
worst_var <- get_pca_var(worst_pca)
print(worst_var)
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"
Quality of representation of PCA

Correlation between variables and PCA

# cos2 of each worst feature on each dimension (not a correlation matrix, hence is.corr = FALSE).
library(corrplot)
corrplot(worst_var[["cos2"]], is.corr = FALSE)

Contributions of variables to PCA

This highlights the variables that contribute most to each component.

# Contribution of each worst feature to each dimension.
corrplot(worst_var[["contrib"]], is.corr = FALSE)

Contributions of variables to PC1 & PC2
# Ten largest variable contributions to PC1 (p1) and PC2 (p2), shown side by side.
library(gridExtra)
p1 <- fviz_contrib(worst_pca, choice = "var", axes = 1, top = 10,
                   fill = "pink", color = "grey")
p2 <- fviz_contrib(worst_pca, choice = "var", axes = 2, top = 10,
                   fill = "skyblue", color = "grey")
grid.arrange(p1, p2, ncol = 2)

BIPLOTS

All

# Biplot of the all-feature PCA: observations as points coloured by diagnosis
# with an ellipse per group, variables drawn in black.
# `col.var` is written out in full; the previous `col = "black"` only reached
# it through partial argument matching.
fviz_pca_biplot(all_pca, col.ind = wbcd$diagnosis, col.var = "black",
                palette = "jco", geom = "point", repel = TRUE,
                legend.title = "Diagnosis", addEllipses = TRUE)
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Mean

# Biplot of the mean-feature PCA: observations as points coloured by diagnosis
# with an ellipse per group, variables drawn in black.
# `col.var` is written out in full; the previous `col = "black"` only reached
# it through partial argument matching.
fviz_pca_biplot(mean_pca, col.ind = wbcd$diagnosis, col.var = "black",
                palette = "jco", geom = "point", repel = TRUE,
                legend.title = "Diagnosis", addEllipses = TRUE)

SE

# Biplot of the SE-feature PCA: observations as points coloured by diagnosis
# with an ellipse per group, variables drawn in black.
# `col.var` is written out in full; the previous `col = "black"` only reached
# it through partial argument matching.
fviz_pca_biplot(se_pca, col.ind = wbcd$diagnosis, col.var = "black",
                palette = "jco", geom = "point", repel = TRUE,
                legend.title = "Diagnosis", addEllipses = TRUE)

Worst

# Biplot of the worst-feature PCA: observations as points coloured by
# diagnosis with an ellipse per group, variables drawn in black.
# `col.var` is written out in full; the previous `col = "black"` only reached
# it through partial argument matching.
fviz_pca_biplot(worst_pca, col.ind = wbcd$diagnosis, col.var = "black",
                palette = "jco", geom = "point", repel = TRUE,
                legend.title = "Diagnosis", addEllipses = TRUE)

6)MODEL BUILDING

Split the data into a training set (70%) and a test set (30%) for evaluating the classification methods.

# Randomly hold out 30% of the observations for testing and train on the
# remaining 70%.
nrows <- NROW(wbcd)
set.seed(218)                           ## fix the RNG so the split is reproducible
## Draw the training row numbers without replacement. seq_len() stays correct
## for an empty data set, and floor() makes the size truncation explicit.
index <- sample(seq_len(nrows), floor(0.7 * nrows))

#train <- wbcd                          ## 569 train data (100%)
train <- wbcd[index,]                   ## 398 train data (70%)
test <- wbcd[-index,]                   ## 171 test data (30%)

# Share of each diagnosis class (Benign / Malignant) in the training set.
prop.table(
  table(train$diagnosis)
)
## 
##    Benign Malignant 
## 0.6180905 0.3819095
# Share of each diagnosis class (Benign / Malignant) in the test set.
prop.table(
  table(test$diagnosis)
)
## 
##    Benign Malignant 
## 0.6491228 0.3508772

APPLYING ML MODELS

naiveBayes

library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
## 
##     impute
# Naive Bayes: fit on the training predictors, predict the test rows and
# compare the predictions with the true diagnosis.
# Column 1 is dropped from the predictors (presumably the diagnosis column;
# verify against the data preparation earlier in the file).
learn_nb <- naiveBayes(x = train[, -1], y = train$diagnosis)
pre_nb <- predict(learn_nb, newdata = test[, -1])
# NOTE(review): the printed output reports 'Benign' as the positive class, so
# Sensitivity refers to benign cases; pass `positive =` if malignant is meant.
cm_nb <- confusionMatrix(data = pre_nb, reference = test$diagnosis)
cm_nb
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       107         6
##   Malignant      4        54
##                                           
##                Accuracy : 0.9415          
##                  95% CI : (0.8951, 0.9716)
##     No Information Rate : 0.6491          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8706          
##                                           
##  Mcnemar's Test P-Value : 0.7518          
##                                           
##             Sensitivity : 0.9640          
##             Specificity : 0.9000          
##          Pos Pred Value : 0.9469          
##          Neg Pred Value : 0.9310          
##              Prevalence : 0.6491          
##          Detection Rate : 0.6257          
##    Detection Prevalence : 0.6608          
##       Balanced Accuracy : 0.9320          
##                                           
##        'Positive' Class : Benign          
## 

randomForest

library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Random forest: 500 trees on all predictors, with proximity and variable
# importance requested. TRUE is used instead of T, which is an ordinary
# variable that can be reassigned.
learn_rf <- randomForest(diagnosis ~ ., data = train, ntree = 500,
                         proximity = TRUE, importance = TRUE)
# Predict the test rows without column 1 and compare with the true diagnosis.
pre_rf <- predict(learn_rf, test[, -1])
cm_rf <- confusionMatrix(pre_rf, test$diagnosis)
cm_rf
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       111         4
##   Malignant      0        56
##                                           
##                Accuracy : 0.9766          
##                  95% CI : (0.9412, 0.9936)
##     No Information Rate : 0.6491          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9478          
##                                           
##  Mcnemar's Test P-Value : 0.1336          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9333          
##          Pos Pred Value : 0.9652          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.6491          
##          Detection Rate : 0.6491          
##    Detection Prevalence : 0.6725          
##       Balanced Accuracy : 0.9667          
##                                           
##        'Positive' Class : Benign          
## 

rpart

library(rpart)
# Single classification tree; minsplit = 2 lets rpart attempt a split on any
# node holding at least two observations.
learn_rp <- rpart(
  diagnosis ~ .,
  data = train,
  control = rpart.control(minsplit = 2)
)
# type = "class" returns predicted class labels for the test rows.
pre_rp <- predict(learn_rp, newdata = test[, -1], type = "class")
cm_rp <- confusionMatrix(data = pre_rp, reference = test$diagnosis)
cm_rp
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       108         5
##   Malignant      3        55
##                                           
##                Accuracy : 0.9532          
##                  95% CI : (0.9099, 0.9796)
##     No Information Rate : 0.6491          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8965          
##                                           
##  Mcnemar's Test P-Value : 0.7237          
##                                           
##             Sensitivity : 0.9730          
##             Specificity : 0.9167          
##          Pos Pred Value : 0.9558          
##          Neg Pred Value : 0.9483          
##              Prevalence : 0.6491          
##          Detection Rate : 0.6316          
##    Detection Prevalence : 0.6608          
##       Balanced Accuracy : 0.9448          
##                                           
##        'Positive' Class : Benign          
## 

AdaBoost

library(rpart)
library(ada)
# Gentle AdaBoost with 70 boosting iterations; the base trees are configured
# through the rpart control object below.
control <- rpart.control(cp = -1, maxdepth = 14, maxcompete = 1, xval = 0)
# NOTE(review): test.x / test.y are given the *training* rows here, so any
# "test" error stored on the fitted object is really a training error.
learn_ada <- ada(
  diagnosis ~ .,
  data = train,
  test.x = train[, -1],
  test.y = train[, 1],
  type = "gentle",
  control = control,
  iter = 70
)
# Evaluate on the held-out test rows.
pre_ada <- predict(learn_ada, newdata = test[, -1])
cm_ada <- confusionMatrix(data = pre_ada, reference = test$diagnosis)
cm_ada
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       110         2
##   Malignant      1        58
##                                           
##                Accuracy : 0.9825          
##                  95% CI : (0.9496, 0.9964)
##     No Information Rate : 0.6491          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9613          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9910          
##             Specificity : 0.9667          
##          Pos Pred Value : 0.9821          
##          Neg Pred Value : 0.9831          
##              Prevalence : 0.6491          
##          Detection Rate : 0.6433          
##    Detection Prevalence : 0.6550          
##       Balanced Accuracy : 0.9788          
##                                           
##        'Positive' Class : Benign          
## 

SVM

# Support vector machine with the default svm() settings (no cost or gamma
# supplied), evaluated on the held-out test rows.
learn_svm <- svm(
  diagnosis ~ .,
  data = train
)
pre_svm <- predict(learn_svm, newdata = test[, -1])
cm_svm <- confusionMatrix(data = pre_svm, reference = test$diagnosis)
cm_svm
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       109         1
##   Malignant      2        59
##                                           
##                Accuracy : 0.9825          
##                  95% CI : (0.9496, 0.9964)
##     No Information Rate : 0.6491          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9616          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9820          
##             Specificity : 0.9833          
##          Pos Pred Value : 0.9909          
##          Neg Pred Value : 0.9672          
##              Prevalence : 0.6491          
##          Detection Rate : 0.6374          
##    Detection Prevalence : 0.6433          
##       Balanced Accuracy : 0.9827          
##                                           
##        'Positive' Class : Benign          
## 

SVM-Tune

# Grid search over the SVM cost and gamma hyper-parameters.
# NOTE(review): every candidate is scored on `test`, so the accuracy of the
# selected model is an optimistic estimate; consider choosing the parameters
# by cross-validation on `train` instead.
gamma <- seq(0, 0.1, 0.005)
cost <- 2^(0:5)
parms <- expand.grid(cost = cost, gamma = gamma)  # 6 costs x 21 gammas = 126 rows

# Test-set accuracy for each row of the grid. The fits are kept local to the
# function so the default-SVM objects `learn_svm` / `pre_svm` created earlier
# are no longer overwritten, and the result vector is allocated in one go.
accuracy2 <- vapply(
  seq_len(nrow(parms)),
  function(i) {
    fit <- svm(diagnosis ~ ., data = train,
               gamma = parms$gamma[i], cost = parms$cost[i])
    pred <- predict(fit, test[, -1])
    confusionMatrix(pred, test$diagnosis)$overall[["Accuracy"]]
  },
  numeric(1)
)

# p is the row index into `parms`, cnt the accuracy reached by that row.
acc <- data.frame(p = seq_len(nrow(parms)), cnt = accuracy2)

# First row with the highest accuracy; which.max() avoids comparing floating
# point numbers with `==`.
opt_p <- acc[which.max(acc$cnt), ]
sub <- paste("Optimal number of parameter is", opt_p$p, "(accuracy :", opt_p$cnt,") in SVM")

library(highcharter)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
# Interactive line chart of accuracy (cnt) against the parameter-grid row
# index (p); the subtitle carries the best row found by the grid search.
hchart(acc, "line", hcaes(x = p, y = cnt)) %>%
  hc_title(
    text = "Accuracy With Varying Parameters (SVM)"
  ) %>%
  hc_subtitle(
    text = sub
  ) %>%
  hc_add_theme(
    hc_theme_google()
  ) %>%
  hc_xAxis(
    title = list(text = "Number of Parameters")
  ) %>%
  hc_yAxis(
    title = list(text = "Accuracy")
  )
# Refit the SVM with the cost/gamma pair of the best grid row (opt_p$p indexes
# into parms) and evaluate it on the test rows.
learn_imp_svm <- svm(
  diagnosis ~ .,
  data = train,
  cost = parms$cost[opt_p$p],
  gamma = parms$gamma[opt_p$p]
)
pre_imp_svm <- predict(learn_imp_svm, newdata = test[, -1])
cm_imp_svm <- confusionMatrix(data = pre_imp_svm, reference = test$diagnosis)
cm_imp_svm
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       110         1
##   Malignant      1        59
##                                           
##                Accuracy : 0.9883          
##                  95% CI : (0.9584, 0.9986)
##     No Information Rate : 0.6491          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9743          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9910          
##             Specificity : 0.9833          
##          Pos Pred Value : 0.9910          
##          Neg Pred Value : 0.9833          
##              Prevalence : 0.6491          
##          Detection Rate : 0.6433          
##    Detection Prevalence : 0.6491          
##       Balanced Accuracy : 0.9872          
##                                           
##        'Positive' Class : Benign          
## 
Prediction Plot
# Fourfold plots of the six confusion matrices on a 2 x 3 grid. Each title
# shows the model label and its accuracy rounded to a whole percent.
col <- c("#ed3b3b", "#0099ff")

# Label -> confusion matrix, in plotting order (fills the grid row by row).
conf_matrices <- list(
  "NaiveBayes"   = cm_nb,
  "RPart"        = cm_rp,
  "RandomForest" = cm_rf,
  "AdaBoost"     = cm_ada,
  "SVM"          = cm_svm,
  "Tune SVM"     = cm_imp_svm
)

# Remember the previous graphics settings so the layout can be restored.
old_par <- par(mfrow = c(2, 3))
for (model_name in names(conf_matrices)) {
  cm <- conf_matrices[[model_name]]
  fourfoldplot(
    cm$table,
    color = col, conf.level = 0, margin = 1,
    main = paste0(model_name, " (", round(cm$overall[1] * 100), "%)")
  )
}
par(old_par)

Select the best prediction model according to the highest accuracy

# Collect the accuracy of every fitted model, label each entry with the model
# name, and keep the entry (or entries, on a tie) with the highest accuracy.
opt_predict <- vapply(
  list(cm_nb, cm_rp, cm_rf, cm_ada, cm_svm, cm_imp_svm),
  function(cm) cm$overall[["Accuracy"]],
  numeric(1)
)
names(opt_predict) <- c(
  "Naive Bayes", "RPart", "Random Forest",
  "AdaBoost", "SVM", "SVM Tune"
)
# which() drops NA comparisons, matching what subset() does.
best_predict_model <- opt_predict[which(opt_predict == max(opt_predict))]
best_predict_model
##  SVM Tune 
## 0.9883041